Applying what you’ve learned, create an economics- or social-related plot that is polished with the appropriate annotations, aesthetics and some simple commentary.
To fulfill required task, I use “Brazilian E-Commerce Public Dataset by Olist” from kaggle.com. This data is a public dataset provided by Olist Store, a Brazilian ecommerce. The dataset contains 100k orders information from 2016 to 2018 at multiple marketplaces in Brazil.
Picture below explain relationship of in the dataset
# Read geolocation dataset
olist_geolocation_dataset <- read.csv("data_input/olist_geolocation_dataset.csv")
# Read customers dataset
olist_customers_dataset <- read.csv("data_input/olist_customers_dataset.csv")
# Read order items dataset
olist_order_items_dataset<- read.csv("data_input/olist_order_items_dataset.csv")
# Read order payment dataset
olist_order_payments_dataset <- read.csv("data_input/olist_order_payments_dataset.csv")
# Read review dataset
olist_order_reviews_dataset <- read.csv("data_input/olist_order_reviews_dataset.csv")
# Read orders dataset
olist_orders_dataset <- read.csv("data_input/olist_orders_dataset.csv")
# Read products dataset
olist_products_dataset <- read.csv("data_input/olist_products_dataset.csv")
# Read sellers dataset
olist_sellers_dataset <- read.csv("data_input/olist_sellers_dataset.csv")
# Read product category translation Brazil-English dataset
product_category_name_translation <- read.csv("data_input/product_category_name_translation.csv")# manipulate_customers_data
olist_customers_dataset$customer_id <- as.character(olist_customers_dataset$customer_id)
olist_customers_dataset$customer_unique_id <- as.character(olist_customers_dataset$customer_unique_id)
olist_customers_dataset$customer_zip_code_prefix <- as.factor(olist_customers_dataset$customer_zip_code_prefix)
# check product dataset name information
summary(olist_products_dataset)## product_id product_category_name
## 00066f42aeeb9f3007548bb9d3f33c38: 1 cama_mesa_banho : 3029
## 00088930e925c41fd95ebfe695fd2655: 1 esporte_lazer : 2867
## 0009406fd7479715e4bef61dd91f2462: 1 moveis_decoracao : 2657
## 000b8f95fcb9e0096488278317764d19: 1 beleza_saude : 2444
## 000d9be29b5207b54e86aa1b1ac54872: 1 utilidades_domesticas: 2335
## 0011c512eb256aa0dbbb544d8dffcf6e: 1 automotivo : 1900
## (Other) :32945 (Other) :17719
## product_name_lenght product_description_lenght product_photos_qty
## Min. : 5.00 Min. : 4.0 Min. : 1.000
## 1st Qu.:42.00 1st Qu.: 339.0 1st Qu.: 1.000
## Median :51.00 Median : 595.0 Median : 1.000
## Mean :48.48 Mean : 771.5 Mean : 2.189
## 3rd Qu.:57.00 3rd Qu.: 972.0 3rd Qu.: 3.000
## Max. :76.00 Max. :3992.0 Max. :20.000
## NA's :610 NA's :610 NA's :610
## product_weight_g product_length_cm product_height_cm product_width_cm
## Min. : 0 Min. : 7.00 Min. : 2.00 Min. : 6.0
## 1st Qu.: 300 1st Qu.: 18.00 1st Qu.: 8.00 1st Qu.: 15.0
## Median : 700 Median : 25.00 Median : 13.00 Median : 20.0
## Mean : 2276 Mean : 30.82 Mean : 16.94 Mean : 23.2
## 3rd Qu.: 1900 3rd Qu.: 38.00 3rd Qu.: 21.00 3rd Qu.: 30.0
## Max. :40425 Max. :105.00 Max. :105.00 Max. :118.0
## NA's :2 NA's :2 NA's :2 NA's :2
# check product category name translation information structure
str(product_category_name_translation)## 'data.frame': 71 obs. of 2 variables:
## $ product_category_name : Factor w/ 71 levels "agro_industria_e_comercio",..: 12 45 9 14 55 33 62 71 69 65 ...
## $ product_category_name_english: Factor w/ 71 levels "agro_industry_and_commerce",..: 44 16 6 8 40 66 60 50 69 71 ...
# adjust product dataset
olist_products_dataset$product_id <- as.character(olist_products_dataset$product_id)
olist_products_dataset$product_category_name <- as.character(olist_products_dataset$product_category_name)
# adjust product category data type
product_category_name_translation$product_category_name <- as.character(product_category_name_translation$product_category_name)
# merge product data with name translation
olist_products_dataset<-merge(x=olist_products_dataset,y=product_category_name_translation, by.x ="product_category_name", by.y="product_category_name", all.x = T)
# manipulate_sellers_data
str(olist_sellers_dataset)## 'data.frame': 3095 obs. of 4 variables:
## $ seller_id : Factor w/ 3095 levels "0015a82c2db000af6aaaf3ae2ecb0532",..: 623 2541 2506 2326 982 2343 2749 322 1458 2490 ...
## $ seller_zip_code_prefix: int 13023 13844 20031 4195 12914 20920 55325 16304 1529 80310 ...
## $ seller_city : Factor w/ 611 levels "04482255","abadia de goias",..: 102 343 451 519 81 451 84 403 519 160 ...
## $ seller_state : Factor w/ 23 levels "AC","AM","BA",..: 23 23 17 23 23 17 14 23 23 16 ...
olist_sellers_dataset$seller_id <- as.character(olist_sellers_dataset$seller_id)
olist_sellers_dataset$seller_zip_code_prefix <- as.factor(olist_sellers_dataset$seller_zip_code_prefix)
# manipulate_payments_data
str(olist_order_payments_dataset)## 'data.frame': 103886 obs. of 5 variables:
## $ order_id : Factor w/ 99440 levels "00010242fe8c5a6d1ba2dd792cb16214",..: 71446 65633 14657 72396 25967 16046 46192 23843 12217 2146 ...
## $ payment_sequential : int 1 1 1 1 1 1 1 1 1 1 ...
## $ payment_type : Factor w/ 5 levels "boleto","credit_card",..: 2 2 2 2 2 2 2 2 2 1 ...
## $ payment_installments: int 8 1 1 8 2 2 1 3 6 1 ...
## $ payment_value : num 99.3 24.4 65.7 107.8 128.4 ...
olist_order_payments_dataset$order_id <- as.character(olist_order_payments_dataset$order_id)
# manipulate_order_items_data
str(olist_order_items_dataset)## 'data.frame': 112650 obs. of 7 variables:
## $ order_id : Factor w/ 98666 levels "00010242fe8c5a6d1ba2dd792cb16214",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ order_item_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ product_id : Factor w/ 32951 levels "00066f42aeeb9f3007548bb9d3f33c38",..: 8629 29598 25668 15323 22080 30848 18182 11123 6385 9013 ...
## $ seller_id : Factor w/ 3095 levels "0015a82c2db000af6aaaf3ae2ecb0532",..: 855 2679 1118 1920 2698 1224 1372 1091 1997 2231 ...
## $ shipping_limit_date: Factor w/ 93318 levels "2016-09-19 00:15:34",..: 24067 7194 45696 89769 1641 9384 39631 81823 60182 81339 ...
## $ price : num 58.9 239.9 199 13 199.9 ...
## $ freight_value : num 13.3 19.9 17.9 12.8 18.1 ...
olist_order_items_dataset$order_id <- as.character(olist_order_items_dataset$order_id)
olist_order_items_dataset$order_item_id <- as.character(olist_order_items_dataset$order_item_id)
olist_order_items_dataset$product_id <- as.character(olist_order_items_dataset$product_id)
olist_order_items_dataset$seller_id <- as.character(olist_order_items_dataset$seller_id)
olist_order_items_dataset$shipping_limit_date <- ymd_hms(olist_order_items_dataset$shipping_limit_date)
# manipulate_order_reviews
str(olist_order_reviews_dataset)## 'data.frame': 100000 obs. of 7 variables:
## $ review_id : Factor w/ 99173 levels "0001239bc1de2e33cb583967c2ca4c67",..: 48013 50030 13322 89282 95962 8159 3060 48292 63672 52194 ...
## $ order_id : Factor w/ 99441 levels "00010242fe8c5a6d1ba2dd792cb16214",..: 45052 63915 97051 39397 55031 68818 88966 75802 60275 72099 ...
## $ review_score : int 4 5 5 5 5 1 5 5 5 4 ...
## $ review_comment_title : Factor w/ 4601 levels "","-","- Luminária de Mesa pelic",..: 1 1 1 1 1 1 1 1 1 3805 ...
## $ review_comment_message : Factor w/ 36923 levels "","'entrega feita dentro do prazo",..: 1 1 1 31778 25773 1 1 1 1 4357 ...
## $ review_creation_date : Factor w/ 637 levels "2016-10-02 00:00:00",..: 413 463 442 147 454 497 230 620 170 536 ...
## $ review_answer_timestamp: Factor w/ 99010 levels "2016-10-07 18:32:28",..: 45452 56963 51957 5910 54897 66179 15081 93211 7935 75810 ...
olist_order_reviews_dataset$review_id <- as.character(olist_order_reviews_dataset$review_id)
olist_order_reviews_dataset$order_id <- as.character(olist_order_reviews_dataset$order_id)
olist_order_reviews_dataset$review_comment_title <- as.character(olist_order_reviews_dataset$review_comment_title)
olist_order_reviews_dataset$review_comment_message <- as.character(olist_order_reviews_dataset$review_comment_message)
olist_order_reviews_dataset$review_creation_date <- ymd_hms(olist_order_reviews_dataset$review_creation_date)
olist_order_reviews_dataset$review_answer_timestamp <- ymd_hms(olist_order_reviews_dataset$review_answer_timestamp)
summary(olist_order_reviews_dataset)## review_id order_id review_score
## Length:100000 Length:100000 Min. :1.000
## Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Median :5.000
## Mean :4.071
## 3rd Qu.:5.000
## Max. :5.000
## review_comment_title review_comment_message review_creation_date
## Length:100000 Length:100000 Min. :2016-10-02 00:00:00
## Class :character Class :character 1st Qu.:2017-09-23 00:00:00
## Mode :character Mode :character Median :2018-02-02 00:00:00
## Mean :2018-01-12 17:58:10
## 3rd Qu.:2018-05-15 00:00:00
## Max. :2018-08-31 00:00:00
## review_answer_timestamp
## Min. :2016-10-07 18:32:28
## 1st Qu.:2017-09-27 01:19:37
## Median :2018-02-04 19:31:06
## Mean :2018-01-15 21:30:46
## 3rd Qu.:2018-05-20 11:00:14
## Max. :2018-10-29 12:27:35
## 'data.frame': 99441 obs. of 8 variables:
## $ order_id : Factor w/ 99441 levels "00010242fe8c5a6d1ba2dd792cb16214",..: 88951 32546 27770 57386 67044 63543 7553 39236 46083 89697 ...
## $ customer_id : Factor w/ 99441 levels "00012a2ce6f8dcda20d059ce98491703",..: 61761 68730 25514 96584 53774 31118 92191 60518 95387 19282 ...
## $ order_status : Factor w/ 8 levels "approved","canceled",..: 4 4 4 4 4 4 5 4 4 4 ...
## $ order_purchase_timestamp : Factor w/ 98875 levels "2016-09-04 21:15:19",..: 27602 90668 94566 35023 55222 15891 6386 9743 667 18535 ...
## $ order_approved_at : Factor w/ 90734 levels "","2016-09-15 12:16:38",..: 26910 83755 86887 33880 52143 15447 6368 9419 707 18033 ...
## $ order_delivered_carrier_date : Factor w/ 81019 levels "","2016-10-08 10:34:01",..: 24736 76469 78219 31487 49664 13978 1 9013 594 17711 ...
## $ order_delivered_customer_date: Factor w/ 95665 levels "","2016-10-11 13:46:32",..: 25876 88743 91784 33916 50233 15592 1 9155 597 18403 ...
## $ order_estimated_delivery_date: Factor w/ 459 levels "2016-09-30 00:00:00",..: 213 412 428 252 299 159 100 121 56 175 ...
olist_orders_dataset$order_id <- as.character(olist_orders_dataset$order_id)
olist_orders_dataset$customer_id <- as.character(olist_orders_dataset$customer_id)
olist_orders_dataset$order_purchase_timestamp <- ymd_hms(olist_orders_dataset$order_purchase_timestamp)
olist_orders_dataset$order_approved_at <- ymd_hms(olist_orders_dataset$order_approved_at)
olist_orders_dataset$order_delivered_carrier_date <- ymd_hms(olist_orders_dataset$order_delivered_carrier_date)
olist_orders_dataset$order_delivered_customer_date <- ymd_hms(olist_orders_dataset$order_delivered_customer_date)
olist_orders_dataset$order_estimated_delivery_date <- ymd_hms(olist_orders_dataset$order_estimated_delivery_date)
#--------------------------------------
summary(olist_order_reviews_dataset)## review_id order_id review_score
## Length:100000 Length:100000 Min. :1.000
## Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Median :5.000
## Mean :4.071
## 3rd Qu.:5.000
## Max. :5.000
## review_comment_title review_comment_message review_creation_date
## Length:100000 Length:100000 Min. :2016-10-02 00:00:00
## Class :character Class :character 1st Qu.:2017-09-23 00:00:00
## Mode :character Mode :character Median :2018-02-02 00:00:00
## Mean :2018-01-12 17:58:10
## 3rd Qu.:2018-05-15 00:00:00
## Max. :2018-08-31 00:00:00
## review_answer_timestamp
## Min. :2016-10-07 18:32:28
## 1st Qu.:2017-09-27 01:19:37
## Median :2018-02-04 19:31:06
## Mean :2018-01-15 21:30:46
## 3rd Qu.:2018-05-20 11:00:14
## Max. :2018-10-29 12:27:35
olist_order <- merge(x=olist_order_items_dataset,y=olist_orders_dataset, by.x ="order_id", by.y="order_id")
olist_order <- merge(x=olist_order,y=olist_products_dataset, by.x ="product_id", by.y="product_id")
olist_order <- merge(x=olist_order,y=olist_sellers_dataset, by.x ="seller_id", by.y="seller_id")
olist_order <- merge(x=olist_order,y=olist_order_payments_dataset, by.x ="order_id", by.y="order_id")
olist_order <- merge(x=olist_order,y=olist_customers_dataset, by.x ="customer_id", by.y="customer_id")
olist_order <- merge(x=olist_order,y=olist_order_reviews_dataset, by.x ="order_id", by.y="order_id")
# View(olist_order)
order_review_payment <- merge(x=olist_orders_dataset,y=olist_order_reviews_dataset, by.x ="order_id", by.y="order_id")
order_review_payment <- merge(x=order_review_payment,y=olist_order_payments_dataset, by.x ="order_id", by.y="order_id")
olist_order_delivered <- olist_order[olist_order$order_status=="delivered",]# tabulate order frequency ofeach product category
summ_product <- as.data.frame(table(product_category_name_english = olist_order_delivered$product_category_name_english))
# sort product frequency
summ_product <- summ_product[order(summ_product$Freq, decreasing = T), ]
# get top 20 product
summ_top20_product <- summ_product[1:20,]
summary(summ_product)## product_category_name_english Freq
## agro_industry_and_commerce: 1 Min. : 2
## air_conditioning : 1 1st Qu.: 96
## art : 1 Median : 291
## arts_and_craftmanship : 1 Mean : 1607
## audio : 1 3rd Qu.: 1885
## auto : 1 Max. :11816
## (Other) :65
# visualize Top 20 Most Ordered Product Category
ggplot(summ_top20_product, aes(x=reorder(product_category_name_english, Freq), y=Freq))+
geom_col()+
coord_flip()+
labs(title = "Top 20 Most Ordered Product Category",
x = "Product",
y = "Freq") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.text = element_text(angle = 90),
legend.text.align = 1) +
geom_text(aes(label = Freq), hjust = -0.1, size = 2) +
theme_minimal()Interpretasi: Bed Bath table category adalah kategori produk dengan penjualan terbanyak, dengan 11.816 total order.
# tabulate mean of product photo quantity of each product category
mean_product_photo <- aggregate.data.frame(
list(photo_qty = olist_order_delivered$product_photos_qty),
by = list(product_category_name_english = olist_order_delivered$product_category_name_english),
mean)
# tabulate mean of product photo quantity of each product category
order_frequency <- aggregate(order_id~product_category_name_english, olist_order_delivered, length)
# merge order frequency and mean of product photo quantity of each product
order_photo_freq <- merge(x=mean_product_photo, y=order_frequency)
order_photo_freq <- order_photo_freq[order(order_photo_freq$order_id,decreasing = T),]
# Visualize descriptive analytic between Order Frequency and Photo Quantity
ggplot(order_photo_freq, aes(photo_qty, order_id)) +
geom_boxplot() +
geom_jitter(aes(col = order_photo_freq$order_id)) +
labs(title = "Relationship between Order Frequency and Photo Quantity",
x = "Mean Photo Quantity",
y = "Order Frequency",
col = "Order Frequency")Interpretasi: Berdasarkan visualisasi sebaran data di atas, dapat ditarik kesimpulan bahwa tidak ada korelasi antara photo quantity dan jumlah order.
# Relationship between products purchase with review score for bed_bath_table product category ----
# calculate duration of days from product purchased and product received by customer
order_review_payment$time_purchased_delivered <- as.numeric(date(order_review_payment$order_delivered_customer_date) - date(order_review_payment$order_purchase_timestamp))
# calculate duration of days from estimated delivery and product received by customer
order_review_payment$time_estimated_delivered <- as.numeric(date(order_review_payment$order_estimated_delivery_date) - date(order_review_payment$order_delivered_customer_date))
# generate plot to inspect relationship
plot(order_review_payment$time_purchased_delivered, order_review_payment$time_estimated_delivered)Interpretasi: Dari plot di atas, diidentifikasi bahwa terdapat korelasi negatif antara estimated delivery dan actual delivery.
# visualize Actual vs Estimated
ggplot(order_review_payment, aes(time_purchased_delivered, time_estimated_delivered)) +
geom_col(fill="darkblue") +
facet_wrap(~review_score) +
theme(strip.text = element_text(size = 7)) +
labs(title = "Actual Delivery Duration vs Estimated Deliver Duration",
x = "Actual Delivery Days",
y = "Estimated Delivery Days")Interpretasi: Berdasarkan visualisasi di atas, nilai review 5 didapatkan saat durasi estimated time tinggi, tetapi durasi actual time rendah.
set.seed(3)
customer_location_data <- olist_customers_dataset[,c("customer_unique_id","customer_zip_code_prefix")]
# sampling 10.000 location, because it's not possible to sample customer due to insuficient memory when merge customer and location data
olist_geolocation_dataset <- olist_geolocation_dataset[sample(1:10000),c("geolocation_zip_code_prefix","geolocation_lat","geolocation_lng")]
customer_location_data <- merge(x=olist_customers_dataset, y=olist_geolocation_dataset, by.x = "customer_zip_code_prefix", by.y = "geolocation_zip_code_prefix", all.x = T)
customer_location_data